In [1]:
## Netflix Release Year
In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
In [2]:
from api_keys import netflix_api_key
In [3]:
# make paths and read csv raw data
data_path = "imdb.csv"

data = pd.read_csv(data_path)
In [4]:
data.head()
Out[4]:
Unnamed: 0 title year kind genre rating vote country language runtime cast director composer writer runtimes
0 0 Dinosaur Planet 2003.0 tv mini series ['Documentary', 'Animation', 'Family'] 7.7 474.0 ['United States'] ['English'] ['50'] ['Christian Slater', 'Scott Sampson'] NaN NaN ['Mike Carrol', 'Mike Carroll', 'Georgann Kane'] NaN
1 1 Character 2021.0 movie ['Crime', 'Horror', 'Thriller'] 8.3 46.0 ['Japan'] ['Japanese'] ['125'] ['Masaki Suda', 'Fukase', 'Mitsuki Takahata', ... ['Akira Nagai'] ['Youki Kojima'] ['Takashi Nagasaki', 'Takashi Nagasaki', 'Anna... NaN
2 2 Get Up and Dance! 1994.0 video movie ['Family'] 8.1 18.0 ['United States'] ['English'] ['54'] ['Paula Abdul', 'Aurorah Allain', 'Bill Bohl',... ['Steve Purcell'] NaN NaN NaN
3 3 The Rise and Fall of El Chapo 2016.0 tv movie ['Documentary'] 6.9 42.0 ['United States'] NaN ['85'] NaN NaN NaN NaN NaN
4 4 Sick - IMDb NaN NaN ['Thriller'] NaN NaN ['United States'] NaN NaN ['Marc Menchaca', 'Gideon Adlon', 'Dylan Spray... ['John Hyams'] NaN ['Katelyn Crabb', 'Kevin Williamson'] NaN
In [5]:
#get needed columns
year_data_df = data[['title','year','rating','vote']]

year_data_df.head()
Out[5]:
title year rating vote
0 Dinosaur Planet 2003.0 7.7 474.0
1 Character 2021.0 8.3 46.0
2 Get Up and Dance! 1994.0 8.1 18.0
3 The Rise and Fall of El Chapo 2016.0 6.9 42.0
4 Sick - IMDb NaN NaN NaN
In [6]:
#drop and null values
year_data_df = year_data_df.dropna()


year_data_df.head(20)
Out[6]:
title year rating vote
0 Dinosaur Planet 2003.0 7.7 474.0
1 Character 2021.0 8.3 46.0
2 Get Up and Dance! 1994.0 8.1 18.0
3 The Rise and Fall of El Chapo 2016.0 6.9 42.0
5 8 Man 1992.0 5.5 93.0
6 What the #$*! Do We (K)now!? 2004.0 5.3 13432.0
7 Class of Nuke 'Em High Part II: Subhumanoid Me... 1991.0 4.5 2177.0
8 The Fighter 2010.0 7.8 351199.0
11 Neil Diamond: Greatest Hits Live 1988.0 8.0 81.0
12 7 Seconds 2005.0 4.8 7153.0
13 By Dawn's Early Light 1990.0 7.1 2898.0
14 Seeta Aur Geeta 1972.0 6.8 1818.0
15 Strange Relations 2001.0 7.6 768.0
16 Sesame Street Presents: The Street We Live On 2004.0 7.2 67.0
17 Lilo and Stitch 2012.0 7.2 22.0
18 Boycott 2001.0 7.2 901.0
19 Meat Loaf: Bat Out of Hell 1999.0 7.7 286.0
20 Aqua Teen Hunger Force 2000.0 7.6 23785.0
21 FernGully 2: The Magical Rescue 1998.0 4.6 1484.0
22 Lady Chatterley 1993.0 6.9 1605.0
In [7]:
# dropping ALL duplicate values
year_data_df.drop_duplicates(subset ="title", keep = 'first', inplace = True)
In [8]:
#check for earliest date
earliestyear = year_data_df['year'].min()
earliestyear
Out[8]:
1914.0
In [9]:
#create the bins to separate the age group and label the bins
bins = [0, 1939.9, 1949.9, 1959.9, 1969.9, 1979.9, 1989.9, 1999.9, 2009.9, 2019.9, 3000]
bin_names = ["Before 1940", "1940-1950", "1950-1960", "1960-1970", "1970-1980", "1980-1990", "1990-2000", "2000-2010", "2010-2020", "Post 2020"]
In [10]:
#add column to dataframe that describes that shows bins
year_data_df["Year Group"] = pd.cut(year_data_df["year"], bins, labels=bin_names, include_lowest=True)
year_data_df.head(20)
Out[10]:
title year rating vote Year Group
0 Dinosaur Planet 2003.0 7.7 474.0 2000-2010
1 Character 2021.0 8.3 46.0 Post 2020
2 Get Up and Dance! 1994.0 8.1 18.0 1990-2000
3 The Rise and Fall of El Chapo 2016.0 6.9 42.0 2010-2020
5 8 Man 1992.0 5.5 93.0 1990-2000
6 What the #$*! Do We (K)now!? 2004.0 5.3 13432.0 2000-2010
7 Class of Nuke 'Em High Part II: Subhumanoid Me... 1991.0 4.5 2177.0 1990-2000
8 The Fighter 2010.0 7.8 351199.0 2010-2020
11 Neil Diamond: Greatest Hits Live 1988.0 8.0 81.0 1980-1990
12 7 Seconds 2005.0 4.8 7153.0 2000-2010
13 By Dawn's Early Light 1990.0 7.1 2898.0 1990-2000
14 Seeta Aur Geeta 1972.0 6.8 1818.0 1970-1980
15 Strange Relations 2001.0 7.6 768.0 2000-2010
16 Sesame Street Presents: The Street We Live On 2004.0 7.2 67.0 2000-2010
17 Lilo and Stitch 2012.0 7.2 22.0 2010-2020
18 Boycott 2001.0 7.2 901.0 2000-2010
19 Meat Loaf: Bat Out of Hell 1999.0 7.7 286.0 1990-2000
20 Aqua Teen Hunger Force 2000.0 7.6 23785.0 2000-2010
21 FernGully 2: The Magical Rescue 1998.0 4.6 1484.0 1990-2000
22 Lady Chatterley 1993.0 6.9 1605.0 1990-2000
In [11]:
#GRAPH FOR DISTRIBUTION OF YEARS
#group by year group
yeargroup_df = year_data_df.groupby(['Year Group'])

ygcount_df = pd.DataFrame(yeargroup_df['year'].count())

#create bar chart
yeargroup_bar = ygcount_df.plot(kind="bar", title="Distribution of Movies by Year", color="crimson", legend=False)

#label the chart
yeargroup_bar.set_xlabel("Years")
yeargroup_bar.set_ylabel("Number of Movie")
plt.tight_layout

plt.show
Out[11]:
<function matplotlib.pyplot.show(close=None, block=None)>
In [12]:
#sort data by rating
topmovies_df = year_data_df.sort_values("rating", ascending=False)

topmovies_df.head()
Out[12]:
title year rating vote Year Group
3848 Dragon Family 2004.0 9.6 8.0 2000-2010
2021 Mortal Kombat: At The Movies 2012.0 9.5 6.0 2010-2020
7655 Band of Brothers 2001.0 9.4 398551.0 2000-2010
2906 The Sopranos: Season 6 Invitation to the Set 2005.0 9.4 47.0 2000-2010
1185 Widespread Panic: The Earth Will Swallow You 2002.0 9.3 13.0 2000-2010
In [13]:
#GRAPH FOR AVERAGE VOTES PER MOVIE BY YEAR
topyear_df = topmovies_df.groupby(['Year Group'])

tycount_df = pd.DataFrame(topyear_df['vote'].mean())

#create bar chart
top25_bar = tycount_df.plot(kind="bar", title="Average Votes per Movie by Year", color="crimson", legend=False)

#label the chart
top25_bar.set_xlabel("Years")
top25_bar.set_ylabel("Average Votes per Movie")
plt.tight_layout

plt.show
Out[13]:
<function matplotlib.pyplot.show(close=None, block=None)>
In [14]:
top25_df = topmovies_df[:25]

top25_df
Out[14]:
title year rating vote Year Group
3848 Dragon Family 2004.0 9.6 8.0 2000-2010
2021 Mortal Kombat: At The Movies 2012.0 9.5 6.0 2010-2020
7655 Band of Brothers 2001.0 9.4 398551.0 2000-2010
2906 The Sopranos: Season 6 Invitation to the Set 2005.0 9.4 47.0 2000-2010
1185 Widespread Panic: The Earth Will Swallow You 2002.0 9.3 13.0 2000-2010
698 Yanni: Live at the Acropolis 1994.0 9.3 410.0 1990-2000
652 The Shawshank Redemption 1994.0 9.3 2461873.0 1990-2000
2807 Queensrÿche: Operation Livecrime 1991.0 9.3 146.0 1990-2000
282 Blue Planet II 2017.0 9.3 36474.0 2010-2020
2600 Pride 25: Body Blow 2003.0 9.3 8.0 2000-2010
3360 311: Live in Concert, New Orleans - 3-11 Day 2004 2004.0 9.3 113.0 2000-2010
1224 Smallville Season 3 Promo 2003.0 9.2 58.0 2000-2010
9183 Depeche Mode: Devotional 1993.0 9.2 914.0 1990-2000
7475 The World at War 1973.0 9.2 23729.0 1970-1980
4662 Behind the Scenes: One Tree Hill Season 6 2009.0 9.2 43.0 2000-2010
1657 The Godfather 1972.0 9.2 1702698.0 1970-1980
7065 Roy Orbison: Black and White Night 30 2017.0 9.2 52.0 2010-2020
753 Dream Theater: Live at Budokan 2004.0 9.2 713.0 2000-2010
6047 The Lex Series Laptop 2019.0 9.2 8.0 2010-2020
5317 Baseball 1994.0 9.2 4028.0 1990-2000
7187 Nine Inch Nails Live: And All That Could Have ... 2002.0 9.2 1251.0 2000-2010
4812 Game of Thrones 2011.0 9.2 1873254.0 2010-2020
3812 Pizza Delivery/Home Sweet Pineapple 1999.0 9.2 1441.0 1990-2000
6558 Selena Live: The Last Concert 1995.0 9.2 175.0 1990-2000
2277 Carly Simon Live from Martha's Vineyard 1987.0 9.1 17.0 1980-1990
In [15]:
## Netflix Kind Values
In [16]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
In [17]:
netflix_pd = pd.read_csv("imdb.csv")
In [18]:
# remove duplicate movies
netflix_pd.drop_duplicates(subset ="title", keep = "first", inplace = True)
In [19]:
# clean and drop NAN in reduced data set
netflix_reduced = netflix_pd.loc[:, ["kind", "rating", "vote"]]
netflix_clean = netflix_reduced.dropna(how="any")
In [20]:
# assign kind values and unique labels
kind_unique = netflix_clean["kind"].unique()
kind_values = netflix_clean["kind"].value_counts()
In [21]:
# graph all the kinds distribution
# define parameters of the graph
labels = ['movie', 'tv short', 'video movie', 'tv movie', 'tv series', 'episode', 'tv mini series', 'video game']
sizes = [5213, 10, 1191, 744, 583, 469, 255, 17]
seperate = (0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01)
In [22]:
# plot the pie chart
plt.figure(figsize=(10,10))
plt.pie(sizes, explode=seperate, autopct="%1.1f%%", labels=labels)
plt.legend(loc="upper right")
plt.axis("equal")
plt.title("Netflix Kind Distribution")
plt.show()
In [23]:
movie_rating = netflix_clean.loc[netflix_clean["kind"] == "movie", "rating"]
video_rating = netflix_clean.loc[netflix_clean["kind"] == "video movie", "rating"]
movie_rating_mean = movie_rating.mean()
video_rating_mean = video_rating.mean()
In [24]:
# define parameters
labels = ["Digital Movies", "Video Movies"]
ratings = [movie_rating_mean, video_rating_mean]
In [30]:
# graph bar chart
plt.bar(labels, ratings, color="red", alpha=0.5, align="center")
plt.title("Digital Movie vs Video Movie Ratings")
plt.xlabel("Movie Type")
plt.ylabel("IMDB Ratings")
plt.ylim(0,10)
plt.show()
In [31]:
## Netflix Genre Types
In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
In [33]:
netflix_pd = pd.read_csv("Netflix_Data.csv")
In [34]:
data_path = "Netflix_Data.csv"
data = pd.read_csv(data_path)
In [35]:
df = pd.read_csv("Netflix_Data.csv")
df.head()
Out[35]:
Title Genre Tags Languages Series or Movie Hidden Gem Score Country Availability Runtime Director Writer ... Netflix Release Date Production House Netflix Link IMDb Link Summary IMDb Votes Image Poster TMDb Trailer Trailer Site
0 Lets Fight Ghost Crime, Drama, Fantasy, Horror, Romance Comedy Programmes,Romantic TV Comedies,Horror ... Swedish, Spanish Series 4.3 Thailand < 30 minutes Tomas Alfredson John Ajvide Lindqvist ... 2021-03-04 Canal+, Sandrew Metronome https://www.netflix.com/watch/81415947 https://www.imdb.com/title/tt1139797 A med student with a supernatural gift tries t... 205926.0 https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... https://m.media-amazon.com/images/M/MV5BOWM4NT... NaN NaN
1 HOW TO BUILD A GIRL Comedy Dramas,Comedies,Films Based on Books,British English Movie 7.0 Canada 1-2 hour Coky Giedroyc Caitlin Moran ... 2021-03-04 Film 4, Monumental Pictures, Lionsgate https://www.netflix.com/watch/81041267 https://www.imdb.com/title/tt4193072 When nerdy Johanna moves to London, things get... 2838.0 https://occ-0-1081-999.1.nflxso.net/dnm/api/v6... https://m.media-amazon.com/images/M/MV5BZGUyN2... https://www.youtube.com/watch?v=eIbcxPy4okQ YouTube
2 Centigrade Drama, Thriller Thrillers English Movie 6.4 Canada 1-2 hour Brendan Walsh Brendan Walsh, Daley Nixon ... 2021-03-04 NaN https://www.netflix.com/watch/81305978 https://www.imdb.com/title/tt8945942 Trapped in a frozen car during a blizzard, a p... 1720.0 https://occ-0-1081-999.1.nflxso.net/dnm/api/v6... https://m.media-amazon.com/images/M/MV5BODM2MD... https://www.youtube.com/watch?v=0RvV7TNUlkQ YouTube
3 ANNE+ Drama TV Dramas,Romantic TV Dramas,Dutch TV Shows Turkish Series 7.7 Belgium,Netherlands < 30 minutes NaN NaN ... 2021-03-04 NaN https://www.netflix.com/watch/81336456 https://www.imdb.com/title/tt6132758 Upon moving into a new place, a 20-something r... 1147.0 https://occ-0-1489-1490.1.nflxso.net/dnm/api/v... https://m.media-amazon.com/images/M/MV5BNWRkMz... NaN NaN
4 Moxie Animation, Short, Drama Social Issue Dramas,Teen Movies,Dramas,Comedie... English Movie 8.1 Lithuania,Poland,France,Iceland,Italy,Spain,Gr... 1-2 hour Stephen Irwin NaN ... 2021-03-04 NaN https://www.netflix.com/watch/81078393 https://www.imdb.com/title/tt2023611 Inspired by her moms rebellious past and a con... 63.0 https://occ-0-4039-1500.1.nflxso.net/dnm/api/v... https://m.media-amazon.com/images/M/MV5BODYyNW... NaN NaN

5 rows × 29 columns

In [36]:
df.isnull().sum()
Out[36]:
Title                        0
Genre                     1710
Tags                        67
Languages                 1935
Series or Movie              0
Hidden Gem Score          2101
Country Availability        19
Runtime                      1
Director                  4708
Writer                    4330
Actors                    1925
View Rating               7024
IMDb Score                2099
Rotten Tomatoes Score     9098
Metacritic Score         11144
Awards Received           9405
Awards Nominated For      7819
Boxoffice                11473
Release Date              2107
Netflix Release Date         0
Production House         10331
Netflix Link                 0
IMDb Link                 2303
Summary                      9
IMDb Votes                2101
Image                        0
Poster                    3638
TMDb Trailer              8286
Trailer Site              8286
dtype: int64
In [37]:
df = df.drop(columns = [ 'Metacritic Score', 'Boxoffice', 'Production House', 'Netflix Link', 'IMDb Link',
        'Poster', 'TMDb Trailer', 'Trailer Site'], axis = 1)
In [38]:
df['Release Date']= pd.to_datetime(df['Release Date'])
df['Netflix Release Date']= pd.to_datetime(df['Netflix Release Date'])
In [39]:
df['Released_Year'] = pd.DatetimeIndex(df['Release Date']).year
df['Released_Year_Net'] = pd.DatetimeIndex(df['Netflix Release Date']).year
In [38]:
colors = ['black',] * 2
colors[0] = 'crimson'

count = df['Series or Movie'].value_counts()

fig = go.Figure(data=[go.Bar(
    x = df["Series or Movie"],
    y = count,
    text = count,
    textposition='auto',
    marker_color=colors # marker color can be a single color value or an iterable
)])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title_text= 'Movie or Tv Series ?',
                  uniformtext_minsize=8, uniformtext_mode='hide',
                  barmode='group', xaxis_tickangle=-45,
                  yaxis=dict(
                  title='Quantity',
                  titlefont_size=14),
                  xaxis=dict(
                  title='Category',
                  titlefont_size=14))
In [40]:
df_movie = df[df['Series or Movie']=='Movie']
df_movie.head(1)
Out[40]:
Title Genre Tags Languages Series or Movie Hidden Gem Score Country Availability Runtime Director Writer ... Rotten Tomatoes Score Awards Received Awards Nominated For Release Date Netflix Release Date Summary IMDb Votes Image Released_Year Released_Year_Net
1 HOW TO BUILD A GIRL Comedy Dramas,Comedies,Films Based on Books,British English Movie 7.0 Canada 1-2 hour Coky Giedroyc Caitlin Moran ... 79.0 1.0 NaN 2020-05-08 2021-03-04 When nerdy Johanna moves to London, things get... 2838.0 https://occ-0-1081-999.1.nflxso.net/dnm/api/v6... 2020.0 2021

1 rows × 23 columns

In [41]:
df_series = df[df["Series or Movie"] == "Series"]
df_series.head(1)
Out[41]:
Title Genre Tags Languages Series or Movie Hidden Gem Score Country Availability Runtime Director Writer ... Rotten Tomatoes Score Awards Received Awards Nominated For Release Date Netflix Release Date Summary IMDb Votes Image Released_Year Released_Year_Net
0 Lets Fight Ghost Crime, Drama, Fantasy, Horror, Romance Comedy Programmes,Romantic TV Comedies,Horror ... Swedish, Spanish Series 4.3 Thailand < 30 minutes Tomas Alfredson John Ajvide Lindqvist ... 98.0 74.0 57.0 2008-12-12 2021-03-04 A med student with a supernatural gift tries t... 205926.0 https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... 2008.0 2021

1 rows × 23 columns

In [42]:
df_series_gen = df_series.dropna(subset=['Genre'])
In [43]:
colors_10 = ['DarkRed', 'FireBrick','Red', 'Crimson', 'IndianRed', 'slategray', 'gray', 'dimgrey', 'DarkSlateGrey', 'black']
series_gen_list = df_series_gen.Genre.str.split(',') #split the list into names
s_gen_list = {} #create an empty list
for genres in series_gen_list: # for any names in series_gen_list
    for genre in genres: # for any genre in genres
        if (genre in s_gen_list): #if this genre is already present in the s_gen_list
            s_gen_list[genre]+=1 # increase his value
        else:  # else
            s_gen_list[genre]=1 # Create his index in the list
s_gen_df = pd.DataFrame(s_gen_list.values(),index = s_gen_list.keys(),
                        columns = {'Counts of Genres in Tv Series'}) #Create a s_gen_df
s_gen_df.sort_values(by = 'Counts of Genres in Tv Series',ascending = False,inplace = True) #Sort the dataframe in ascending order
top_10_s_gen = s_gen_df[0:10] 
In [45]:
fig = go.Figure(data=[go.Bar(
    x = top_10_s_gen.index,
    y = top_10_s_gen['Counts of Genres in Tv Series'],
    text = top_10_s_gen['Counts of Genres in Tv Series'],
    textposition='auto',
    marker_color=colors_10 # marker color can be a single color value or an iterable
)])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title_text= 'Most Popular in TV Genre',
                  uniformtext_minsize=8, uniformtext_mode='hide',
                  yaxis=dict(
                  title='Quantity',
                  titlefont_size=14),
                  xaxis=dict(
                  title='Genres',
                  titlefont_size=14))
In [46]:
df_movie_gen = df_movie.dropna(subset=['Genre'])
In [47]:
movie_gen_list = df_movie_gen.Genre.str.split(', ') #split the list into names
m_gen_list = {} #create an empty list
for genres in movie_gen_list: # for any genres in movie_gen_list
    for genre in genres: # for any genre in genres
        if (genre in m_gen_list): #if this name is already present in the m_gen_list
            m_gen_list[genre]+=1 # increase his value
        else:  # else
            m_gen_list[genre]=1 # Create his index in the list
m_gen_df = pd.DataFrame(m_gen_list.values(),index = m_gen_list.keys(),
                        columns = {'Counts of Genres in Movies'}) #Create a m_gen_df
m_gen_df.sort_values(by = 'Counts of Genres in Movies',ascending = False,inplace = True) #Sort the dataframe in ascending order
top_10_m_gen = m_gen_df[0:10] 
In [49]:
fig = go.Figure(data=[go.Bar(
    x = top_10_m_gen.index,
    y = top_10_m_gen['Counts of Genres in Movies'],
    text = top_10_m_gen['Counts of Genres in Movies'],
    textposition='auto',
    marker_color=colors_10 # marker color can be a single color value or an iterable
)])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title_text= 'Most Popular Movie Genre',
                  uniformtext_minsize=8, uniformtext_mode='hide',
                  yaxis=dict(
                  title='Quantity',
                  titlefont_size=14),
                  xaxis=dict(
                  title='Genres',
                  titlefont_size=14))
In [50]:
df_series_imdb = df_series.dropna(subset=['IMDb Score'])
df_series_imdb = df_series_imdb.sort_values(by = 'IMDb Score', ascending = False)
top_s_imdb_10_list =df_series_imdb[:10]
In [54]:
fig = go.Figure(data=[go.Bar(
    x = top_s_imdb_10_list['Title'],
    y = top_s_imdb_10_list['IMDb Score'],
    text = top_s_imdb_10_list['IMDb Score'],
    textposition='auto',
    marker_color=colors_10 # marker color can be a single color value or an iterable
)])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title_text= 'Top Rated Tv Series Rated by IMDB Score?',
                  uniformtext_minsize=8, uniformtext_mode='hide',
                  yaxis=dict(
                  title='IMDb Score',
                  titlefont_size=14),
                  xaxis=dict(
                  title='Titles',
                  titlefont_size=14))
In [52]:
df_movie_imdb = df_movie.dropna(subset=['IMDb Score'])
df_movie_imdb = df_movie_imdb.sort_values(by = 'IMDb Score', ascending = False)
top_m_imdb_10_list = df_movie_imdb[:10]
In [55]:
fig = go.Figure(data=[go.Bar(
    x = top_m_imdb_10_list['Title'],
    y = top_m_imdb_10_list['IMDb Score'],
    text = top_m_imdb_10_list['IMDb Score'],
    textposition='auto',
    marker_color=colors_10 # marker color can be a single color value or an iterable
)])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title_text= 'Top Rated Movies Rated by IMDB Rating',
                  uniformtext_minsize=8, uniformtext_mode='hide',
                  yaxis=dict(
                  title='IMDb Score',
                  titlefont_size=14),
                  xaxis=dict(
                  title='Titles',
                  titlefont_size=14))
In [56]:
##Country of Orgin
In [63]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AffinityPropagation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import plotly as py
import plotly.graph_objs as go
import os
py.offline.init_notebook_mode(connected = True)
#print(os.listdir("../input"))
import datetime as dt
import missingno as msno
plt.rcParams['figure.dpi'] = 140
In [64]:
df = pd.read_csv('netflix_titles.csv')
df.head(3)
Out[64]:
show_id type title director cast country date_added release_year rating duration listed_in description
0 s1 TV Show 3% NaN João Miguel, Bianca Comparato, Michel Gomes, R... Brazil August 14, 2020 2020 TV-MA 4 Seasons International TV Shows, TV Dramas, TV Sci-Fi &... In a future where the elite inhabit an island ...
1 s2 Movie 7:19 Jorge Michel Grau Demián Bichir, Héctor Bonilla, Oscar Serrano, ... Mexico December 23, 2016 2016 TV-MA 93 min Dramas, International Movies After a devastating earthquake hits Mexico Cit...
2 s3 Movie 23:59 Gilbert Chan Tedd Chan, Stella Chung, Henley Hii, Lawrence ... Singapore December 20, 2018 2011 R 78 min Horror Movies, International Movies When an army recruit is found dead, his fellow...
In [65]:
# Missing data

for i in df.columns:
    null_rate = df[i].isna().sum() / len(df) * 100 
    if null_rate > 0 :
        print("{} null rate: {}%".format(i,round(null_rate,2)))
director null rate: 30.68%
cast null rate: 9.22%
country null rate: 6.51%
date_added null rate: 0.13%
rating null rate: 0.09%
In [66]:
# Replacments

df['country'] = df['country'].fillna(df['country'].mode()[0])


df['cast'].replace(np.nan, 'No Data',inplace  = True)
df['director'].replace(np.nan, 'No Data',inplace  = True)

# Drops

df.dropna(inplace=True)

# Drop Duplicates

df.drop_duplicates(inplace= True)
In [67]:
df.isnull().sum()
Out[67]:
show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64
In [68]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7770 entries, 0 to 7786
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7770 non-null   object
 1   type          7770 non-null   object
 2   title         7770 non-null   object
 3   director      7770 non-null   object
 4   cast          7770 non-null   object
 5   country       7770 non-null   object
 6   date_added    7770 non-null   object
 7   release_year  7770 non-null   int64 
 8   rating        7770 non-null   object
 9   duration      7770 non-null   object
 10  listed_in     7770 non-null   object
 11  description   7770 non-null   object
dtypes: int64(1), object(11)
memory usage: 789.1+ KB
In [69]:
df["date_added"] = pd.to_datetime(df['date_added'])

df['month_added']=df['date_added'].dt.month
df['month_name_added']=df['date_added'].dt.month_name()
df['year_added'] = df['date_added'].dt.year

df.head(3)
Out[69]:
show_id type title director cast country date_added release_year rating duration listed_in description month_added month_name_added year_added
0 s1 TV Show 3% No Data João Miguel, Bianca Comparato, Michel Gomes, R... Brazil 2020-08-14 2020 TV-MA 4 Seasons International TV Shows, TV Dramas, TV Sci-Fi &... In a future where the elite inhabit an island ... 8 August 2020
1 s2 Movie 7:19 Jorge Michel Grau Demián Bichir, Héctor Bonilla, Oscar Serrano, ... Mexico 2016-12-23 2016 TV-MA 93 min Dramas, International Movies After a devastating earthquake hits Mexico Cit... 12 December 2016
2 s3 Movie 23:59 Gilbert Chan Tedd Chan, Stella Chung, Henley Hii, Lawrence ... Singapore 2018-12-20 2011 R 78 min Horror Movies, International Movies When an army recruit is found dead, his fellow... 12 December 2018
In [70]:
# Helper column for various plots
df['count'] = 1

# Many productions have several countries listed - this will skew our results , we'll grab the first one mentioned

# Lets retrieve just the first country
df['first_country'] = df['country'].apply(lambda x: x.split(",")[0])
df['first_country'].head()

# Rating ages from this notebook: https://www.kaggle.com/andreshg/eda-beginner-to-expert-plotly (thank you!)

ratings_ages = {
    'TV-PG': 'Older Kids',
    'TV-MA': 'Adults',
    'TV-Y7-FV': 'Older Kids',
    'TV-Y7': 'Older Kids',
    'TV-14': 'Teens',
    'R': 'Adults',
    'TV-Y': 'Kids',
    'NR': 'Adults',
    'PG-13': 'Teens',
    'TV-G': 'Kids',
    'PG': 'Older Kids',
    'G': 'Kids',
    'UR': 'Adults',
    'NC-17': 'Adults'
}

df['target_ages'] = df['rating'].replace(ratings_ages)
df['target_ages'].unique()

# Genre

df['genre'] = df['listed_in'].apply(lambda x :  x.replace(' ,',',').replace(', ',',').split(',')) 

# Reducing name length

df['first_country'].replace('United States', 'USA', inplace=True)
df['first_country'].replace('United Kingdom', 'UK',inplace=True)
df['first_country'].replace('South Korea', 'S. Korea',inplace=True)
In [71]:
data = df.groupby('first_country')['count'].sum().sort_values(ascending=False)[:10]

# Plot

color_map = ['#f5f5f1' for _ in range(10)]
color_map[0] = color_map[1] = color_map[2] =  '#b20710' # color highlight

fig, ax = plt.subplots(1,1, figsize=(12, 6))
ax.bar(data.index, data, width=0.5, 
       edgecolor='darkgray',
       linewidth=0.6,color=color_map)

#annotations
for i in data.index:
    ax.annotate(f"{data[i]}", 
                   xy=(i, data[i] + 150), #i like to change this to roughly 5% of the highest cat
                   va = 'center', ha='center',fontweight='light', fontfamily='serif')



# Remove border from plot

for s in ['top', 'left', 'right']:
    ax.spines[s].set_visible(False)
    
# Tick labels

ax.set_xticklabels(data.index, fontfamily='serif', rotation=0)

# Title and sub-title

fig.text(0.09, 1, 'Top 10 countries on Netflix', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(0.09, 0.95, 'The three most frequent countries have been highlighted.', fontsize=12, fontweight='light', fontfamily='serif')

fig.text(1.1, 1.01, 'Insight', fontsize=15, fontweight='bold', fontfamily='serif')

fig.text(1.1, 0.67, '''
The most prolific producers of
content for Netflix are, primarily,
the USA, with India and the UK
a significant distance behind.

It makes sense that the USA produces 
the most content as, afterall, 
Netflix is a US company.
'''
         , fontsize=12, fontweight='light', fontfamily='serif')

ax.grid(axis='y', linestyle='-', alpha=0.4)   

grid_y_ticks = np.arange(0, 4000, 500) # y ticks, min, max, then step
ax.set_yticks(grid_y_ticks)
ax.set_axisbelow(True)

#Axis labels

#plt.xlabel("Country", fontsize=12, fontweight='light', fontfamily='serif',loc='left',y=-1.5)
#plt.ylabel("Count", fontsize=12, fontweight='light', fontfamily='serif')
 #plt.legend(loc='upper right')
    
# thicken the bottom line if you want to
plt.axhline(y = 0, color = 'black', linewidth = 1.3, alpha = .7)

ax.tick_params(axis='both', which='major', labelsize=12)


import matplotlib.lines as lines
l1 = lines.Line2D([1, 1], [0, 1], transform=fig.transFigure, figure=fig,color='black',lw=0.2)
fig.lines.extend([l1])

ax.tick_params(axis=u'both', which=u'both',length=0)

plt.show()
In [75]:
country_order = df['first_country'].value_counts()[:11].index
data_q2q3 = df[['type', 'first_country']].groupby('first_country')['type'].value_counts().unstack().loc[country_order]
data_q2q3['sum'] = data_q2q3.sum(axis=1)
data_q2q3_ratio = (data_q2q3.T / data_q2q3['sum']).T[['Movie', 'TV Show']].sort_values(by='Movie',ascending=False)[::-1]




###
fig, ax = plt.subplots(1,1,figsize=(15, 8),)

ax.barh(data_q2q3_ratio.index, data_q2q3_ratio['Movie'], 
        color='#b20710', alpha=0.8, label='Movie')
ax.barh(data_q2q3_ratio.index, data_q2q3_ratio['TV Show'], left=data_q2q3_ratio['Movie'], 
        color='#221f1f', alpha=0.8, label='TV Show')


ax.set_xlim(0, 1)
ax.set_xticks([])
ax.set_yticklabels(data_q2q3_ratio.index, fontfamily='serif', fontsize=11)

# male percentage
for i in data_q2q3_ratio.index:
    ax.annotate(f"{data_q2q3_ratio['Movie'][i]*100:.3}%", 
                   xy=(data_q2q3_ratio['Movie'][i]/2, i),
                   va = 'center', ha='center',fontsize=12, fontweight='light', fontfamily='serif',
                   color='white')

for i in data_q2q3_ratio.index:
    ax.annotate(f"{data_q2q3_ratio['TV Show'][i]*100:.3}%", 
                   xy=(data_q2q3_ratio['Movie'][i]+data_q2q3_ratio['TV Show'][i]/2, i),
                   va = 'center', ha='center',fontsize=12, fontweight='light', fontfamily='serif',
                   color='white')
    

fig.text(0.13, 0.93, 'Top 10 countries Movie & TV Show split', fontsize=15, fontweight='bold', fontfamily='serif')   
fig.text(0.131, 0.89, 'Percent Stacked Bar Chart', fontsize=12,fontfamily='serif')   

for s in ['top', 'left', 'right', 'bottom']:
    ax.spines[s].set_visible(False)
    
#ax.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.06))

fig.text(0.75,0.9,"Movie", fontweight="bold", fontfamily='serif', fontsize=15, color='#b20710')
fig.text(0.81,0.9,"|", fontweight="bold", fontfamily='serif', fontsize=15, color='black')
fig.text(0.82,0.9,"TV Show", fontweight="bold", fontfamily='serif', fontsize=15, color='#221f1f')


fig.text(1.1, 0.93, 'Insight', fontsize=15, fontweight='bold', fontfamily='serif')

fig.text(1.1, 0.44, '''
Interestingly, Netflix in India
is made up nearly entirely of Movies. 

Bollywood is big business, and perhaps
the main focus of this industry is Movies
and not TV Shows.

South Korean Netflix on the other hand is 
almost entirely TV Shows.

The underlying resons for the difference 
in content must be due to market research
conducted by Netflix.
'''
         , fontsize=12, fontweight='light', fontfamily='serif')



import matplotlib.lines as lines
l1 = lines.Line2D([1, 1], [0, 1], transform=fig.transFigure, figure=fig,color='black',lw=0.2)
fig.lines.extend([l1])




ax.tick_params(axis='both', which='major', labelsize=12)
ax.tick_params(axis=u'both', which=u'both',length=0)

plt.show()
In [76]:
##Language
In [77]:
netflix_pd = pd.read_csv("NetflixOriginals.csv")
In [78]:
data_path = "NetflixOriginals.csv"
data = pd.read_csv(data_path)
In [79]:
df = pd.read_csv("NetflixOriginals.csv")
df.head()
Out[79]:
Title Genre Premiere Runtime IMDB Score Language
0 Enter the Anime Documentary 5-Aug-19 58 2.5 English/Japanese
1 Dark Forces Thriller 21-Aug-20 81 2.6 Spanish
2 The App Science fiction/Drama 26-Dec-19 79 2.6 Italian
3 The Open House Horror thriller 19-Jan-18 94 3.2 English
4 Kaali Khuhi Mystery 30-Oct-20 90 3.4 Hindi
In [80]:
common_languages=netflix_pd['Language'].value_counts().reset_index(name='total')
language_list=common_languages[common_languages['total']>=3]['index']
In [82]:
common_languages[common_languages['total']>3].plot.bar(x='index', y='total',rot=90)
Out[82]:
<AxesSubplot:xlabel='index'>
In [83]:
## Netflix Runtime
In [84]:
netflix_pd = pd.read_csv("IMDB-Movie-Data.csv")
netflix_pd.head()
Out[84]:
Rank Title Genre Description Director Actors Year Runtime (Minutes) Rating Votes Revenue (Millions) Metascore
0 1 Guardians of the Galaxy Action,Adventure,Sci-Fi A group of intergalactic criminals are forced ... James Gunn Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... 2014 121 8.1 757074 333.13 76.0
1 2 Prometheus Adventure,Mystery,Sci-Fi Following clues to the origin of mankind, a te... Ridley Scott Noomi Rapace, Logan Marshall-Green, Michael Fa... 2012 124 7.0 485820 126.46 65.0
2 3 Split Horror,Thriller Three girls are kidnapped by a man with a diag... M. Night Shyamalan James McAvoy, Anya Taylor-Joy, Haley Lu Richar... 2016 117 7.3 157606 138.12 62.0
3 4 Sing Animation,Comedy,Family In a city of humanoid animals, a hustling thea... Christophe Lourdelet Matthew McConaughey,Reese Witherspoon, Seth Ma... 2016 108 7.2 60545 270.32 59.0
4 5 Suicide Squad Action,Adventure,Fantasy A secret government agency recruits some of th... David Ayer Will Smith, Jared Leto, Margot Robbie, Viola D... 2016 123 6.2 393727 325.02 40.0
In [85]:
netflix_pd.drop_duplicates(subset ="Title", keep = "first", inplace = True)
In [86]:
netflix_runtime = netflix_pd[["Title","Year","Rating","Runtime (Minutes)"]]
In [87]:
netflix_runtime = netflix_runtime.dropna(how="any")
In [88]:
bins = [0, 59.99, 74.99, 89.99, 104.99, 119.99, 134.99, 149.99, 300]
bin_names = ["Less than 60", "60-75", "75-90", "90-105", "105-120", "120-135", "135-150", "More than 150"]
In [89]:
netflix_pd["Runtime Group (Minutes)"] = pd.cut(netflix_runtime["Runtime (Minutes)"], bins, labels=bin_names, include_lowest=True)
In [90]:
# theres a better way to do this BUT...
# get the mean rating of each bin
less_hour_rating = netflix_pd.loc[netflix_pd["Runtime Group (Minutes)"] == "Less than 60", "Rating"]
hour_rating = netflix_pd.loc[netflix_pd["Runtime Group (Minutes)"] == "60-75", "Rating"]
hour_15_rating = netflix_pd.loc[netflix_pd["Runtime Group (Minutes)"] == "75-90", "Rating"]
hour_30_rating = netflix_pd.loc[netflix_pd["Runtime Group (Minutes)"] == "90-105", "Rating"]
hour_45_rating = netflix_pd.loc[netflix_pd["Runtime Group (Minutes)"] == "105-120", "Rating"]
two_hour_rating = netflix_pd.loc[netflix_pd["Runtime Group (Minutes)"] == "120-135", "Rating"]
two_hour_15_rating = netflix_pd.loc[netflix_pd["Runtime Group (Minutes)"] == "135-150", "Rating"]
over_hour_rating = netflix_pd.loc[netflix_pd["Runtime Group (Minutes)"] == "More than 150", "Rating"]
In [91]:
less_hour_rating_mean = less_hour_rating.mean()
hour_rating_mean = hour_rating.mean()
hour_15_rating_mean = hour_15_rating.mean()
hour_30_rating_mean = hour_30_rating.mean()
hour_45_rating_mean = hour_45_rating.mean()
two_hour_rating_mean = two_hour_rating.mean()
two_hour_15_rating_mean = two_hour_15_rating.mean()
over_hour_rating_mean = over_hour_rating.mean()
In [92]:
# define parameters
labels = ["Less than 60", "60-75", "75-90", "90-105", "105-120", "120-135", "135-150", "More than 150"]
ratings = [0, hour_rating_mean, hour_15_rating_mean, hour_30_rating_mean, hour_45_rating_mean, two_hour_rating_mean, two_hour_15_rating_mean, over_hour_rating_mean]
In [93]:
# graph bar chart
plt.bar(labels, ratings, color="red", alpha=0.5, align="center")
plt.title("Runtime Ratings")
plt.xlabel("Runtime Bins (Minutes)")
plt.ylabel("Ratings")
plt.ylim(0, 10)
plt.xticks(rotation=75)
Out[93]:
([0, 1, 2, 3, 4, 5, 6, 7],
 [Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, ''),
  Text(0, 0, '')])
In [ ]:
 
In [ ]: